library(skimr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Introduction

The data we use comes from a competitive Tetris ladder, https://ch.tetr.io/. It was scraped via the API: https://ch.tetr.io/api/users/by/league?limit=50&after=23731.327577078657%3A0%3A1e-10

The Dataset

After scraping we are now ready to look into the data:

Loading the CSV

# Read the scraped Tetra League export into a data frame
data <- read.csv(file = "tldata.csv")

# Preview the first six rows
head(data, n = 6L)

Preparing the data

# Clean up column types before the analysis.

# Rewrite the "<name>, <qualifier>" country labels into a more readable form.
# Country is intentionally left as character: sub() returns a character
# vector, so a preceding as.factor() would have no lasting effect.
# fixed = TRUE matches the labels literally instead of as regular expressions.
data$Country <- sub("Korea, Republic of", "Republic of Korea",
                    data$Country, fixed = TRUE)
data$Country <- sub("Venezuela, Bolivarian Republic of",
                    "Republic of Venezuela", data$Country, fixed = TRUE)
data$Country <- sub("Macedonia, the former Yugoslav Republic of",
                    "Republic of Macedonia", data$Country, fixed = TRUE)

# Rank becomes a factor whose levels follow the order listed here (D first,
# X+ last) rather than the default alphabetical order.
data$Rank <- factor(
  data$Rank,
  levels = c(
    "D", "D+", "C-", "C", "C+", "B-", "B", "B+", "A-", "A", "A+",
    "S-", "S", "S+", "SS", "U", "X", "X+"
  )
)

# Encode the two flag columns as numeric indicators: 1 for "Yes", 0 otherwise.
# The comparison works on the raw column, so no intermediate factor is needed.
data$Active.This.Week <- ifelse(data$Active.This.Week == "Yes", 1, 0)
data$Supporter.Status. <- ifelse(data$Supporter.Status. == "Yes", 1, 0)

# Store the game counts as doubles and the usernames as plain strings
data$Wins <- as.numeric(data$Wins)
data$Games.Played <- as.numeric(data$Games.Played)
data$Username <- as.character(data$Username)

# Remove the index column X (the Standing column already does this job)
data$X <- NULL
data

Exploratory Data Analysis

EDA 1.0 Looking into the data

Dimensions

# Data dimensions: number of rows, then number of columns
dim(data) 
## [1] 39769    16

Summary

# Data summary: quartiles and mean for numeric columns, level counts for
# factors, and length/class/mode for character columns
summary(data)
##     Standing       Username           Country               Wins       
##  Min.   :    1   Length:39769       Length:39769       Min.   :   0.0  
##  1st Qu.: 9943   Class :character   Class :character   1st Qu.:  31.0  
##  Median :19885   Mode  :character   Mode  :character   Median :  83.0  
##  Mean   :19885                                         Mean   : 158.7  
##  3rd Qu.:29827                                         3rd Qu.: 202.0  
##  Max.   :39769                                         Max.   :4001.0  
##                                                                        
##   Games.Played       Winrate            APM              PPS       
##  Min.   :  10.0   Min.   :0.0000   Min.   :  1.05   Min.   :0.300  
##  1st Qu.:  63.0   1st Qu.:0.4844   1st Qu.: 15.11   1st Qu.:0.940  
##  Median : 159.0   Median :0.5087   Median : 23.10   Median :1.170  
##  Mean   : 311.2   Mean   :0.4951   Mean   : 30.56   Mean   :1.259  
##  3rd Qu.: 389.0   3rd Qu.:0.5327   3rd Qu.: 38.13   3rd Qu.:1.480  
##  Max.   :8142.0   Max.   :1.0000   Max.   :227.68   Max.   :4.270  
##                                                                    
##        VS         Glicko.Rating  Rating.Deviation  Tetra.Rating     
##  Min.   :  1.75   Min.   : 265   Min.   : 60.00   Min.   :   11.47  
##  1st Qu.: 32.44   1st Qu.:1168   1st Qu.: 62.00   1st Qu.: 4531.12  
##  Median : 49.86   Median :1479   Median : 72.00   Median : 9509.57  
##  Mean   : 64.76   Mean   :1496   Mean   : 74.94   Mean   : 9862.82  
##  3rd Qu.: 81.44   3rd Qu.:1774   3rd Qu.: 86.00   3rd Qu.:14693.09  
##  Max.   :438.21   Max.   :4276   Max.   :100.00   Max.   :24752.28  
##                                                                     
##       Rank       Active.This.Week Supporter.Status.  RankColour       
##  B      : 3182   Min.   :0.0000   Min.   :0.00000   Length:39769      
##  A-     : 3182   1st Qu.:0.0000   1st Qu.:0.00000   Class :character  
##  A+     : 3182   Median :1.0000   Median :0.00000   Mode  :character  
##  B-     : 3181   Mean   :0.5722   Mean   :0.02721                     
##  B+     : 3181   3rd Qu.:1.0000   3rd Qu.:0.00000                     
##  A      : 3181   Max.   :1.0000   Max.   :1.00000                     
##  (Other):20680
# Column (feature) names of the data frame
colnames(data)
##  [1] "Standing"          "Username"          "Country"          
##  [4] "Wins"              "Games.Played"      "Winrate"          
##  [7] "APM"               "PPS"               "VS"               
## [10] "Glicko.Rating"     "Rating.Deviation"  "Tetra.Rating"     
## [13] "Rank"              "Active.This.Week"  "Supporter.Status."
## [16] "RankColour"

Structure & data types

# Compact overview of every column: its type and first few values
str(data)
## 'data.frame':    39769 obs. of  16 variables:
##  $ Standing         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Username         : chr  "5HAN" "CABOOZLED_PIE" "TURTLE" "SYAKEGOHAN" ...
##  $ Country          : chr  "Japan" "United States" "Republic of Korea" "Japan" ...
##  $ Wins             : num  1026 347 511 358 320 ...
##  $ Games.Played     : num  1233 394 670 437 454 ...
##  $ Winrate          : num  0.832 0.881 0.763 0.819 0.705 ...
##  $ APM              : num  228 213 195 203 191 ...
##  $ PPS              : num  4.27 3.92 3.39 3.83 3.42 3.84 3.44 3.56 3.04 3.84 ...
##  $ VS               : num  438 421 389 394 391 ...
##  $ Glicko.Rating    : int  4276 4026 3963 3944 3931 3899 3862 3854 3853 3767 ...
##  $ Rating.Deviation : int  85 71 72 76 72 81 67 65 86 68 ...
##  $ Tetra.Rating     : num  24752 24641 24601 24591 24579 ...
##  $ Rank             : Factor w/ 18 levels "D","D+","C-",..: 18 18 18 18 18 18 18 18 18 18 ...
##  $ Active.This.Week : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ Supporter.Status.: num  1 1 0 1 1 1 0 1 1 0 ...
##  $ RankColour       : chr  "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
# Class of each column, keyed by column name
sapply(data, class)
##          Standing          Username           Country              Wins 
##         "integer"       "character"       "character"         "numeric" 
##      Games.Played           Winrate               APM               PPS 
##         "numeric"         "numeric"         "numeric"         "numeric" 
##                VS     Glicko.Rating  Rating.Deviation      Tetra.Rating 
##         "numeric"         "integer"         "integer"         "numeric" 
##              Rank  Active.This.Week Supporter.Status.        RankColour 
##          "factor"         "numeric"         "numeric"       "character"

Basic summaries

# Numeric variables: flag every numeric column, then print min-max, quartiles
# and mean for each of them.
# vapply() always yields a logical vector, and drop = FALSE keeps the subset a
# data frame even when only one column is numeric.
num_vars <- vapply(data, is.numeric, logical(1))
summary(data[, num_vars, drop = FALSE])
##     Standing          Wins         Games.Played       Winrate      
##  Min.   :    1   Min.   :   0.0   Min.   :  10.0   Min.   :0.0000  
##  1st Qu.: 9943   1st Qu.:  31.0   1st Qu.:  63.0   1st Qu.:0.4844  
##  Median :19885   Median :  83.0   Median : 159.0   Median :0.5087  
##  Mean   :19885   Mean   : 158.7   Mean   : 311.2   Mean   :0.4951  
##  3rd Qu.:29827   3rd Qu.: 202.0   3rd Qu.: 389.0   3rd Qu.:0.5327  
##  Max.   :39769   Max.   :4001.0   Max.   :8142.0   Max.   :1.0000  
##       APM              PPS              VS         Glicko.Rating 
##  Min.   :  1.05   Min.   :0.300   Min.   :  1.75   Min.   : 265  
##  1st Qu.: 15.11   1st Qu.:0.940   1st Qu.: 32.44   1st Qu.:1168  
##  Median : 23.10   Median :1.170   Median : 49.86   Median :1479  
##  Mean   : 30.56   Mean   :1.259   Mean   : 64.76   Mean   :1496  
##  3rd Qu.: 38.13   3rd Qu.:1.480   3rd Qu.: 81.44   3rd Qu.:1774  
##  Max.   :227.68   Max.   :4.270   Max.   :438.21   Max.   :4276  
##  Rating.Deviation  Tetra.Rating      Active.This.Week Supporter.Status.
##  Min.   : 60.00   Min.   :   11.47   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 62.00   1st Qu.: 4531.12   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median : 72.00   Median : 9509.57   Median :1.0000   Median :0.00000  
##  Mean   : 74.94   Mean   : 9862.82   Mean   :0.5722   Mean   :0.02721  
##  3rd Qu.: 86.00   3rd Qu.:14693.09   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :100.00   Max.   :24752.28   Max.   :1.0000   Max.   :1.00000
# Drop usernames to keep this output from getting too long
data_no_username <- data %>% select(-Username)

# Categorical variables: columns that are factors or character vectors.
# vapply() always yields a logical vector regardless of the input.
cat_vars <- vapply(
  data_no_username,
  function(column) is.factor(column) || is.character(column),
  logical(1)
)

# Frequency table per categorical column. drop = FALSE keeps the subset a data
# frame, so a single matching column is still tabulated as a whole column.
lapply(data_no_username[, cat_vars, drop = FALSE], table)
## $Country
## 
##                                              
##                                          447 
##                                  Afghanistan 
##                                            2 
##                                Åland Islands 
##                                            1 
##                                      Albania 
##                                            3 
##                                      Algeria 
##                                            6 
##                               American Samoa 
##                                            1 
##                                      Andorra 
##                                            7 
##                                       Angola 
##                                            2 
##                                     Anguilla 
##                                            2 
##                                   Antarctica 
##                                           87 
##                          Antigua and Barbuda 
##                                            3 
##                                    Argentina 
##                                          497 
##                                      Armenia 
##                                            5 
##                                    Australia 
##                                         1333 
##                                      Austria 
##                                           45 
##                                   Azerbaijan 
##                                           10 
##                                      Bahamas 
##                                            3 
##                                      Bahrain 
##                                           10 
##                                   Bangladesh 
##                                           14 
##                                     Barbados 
##                                            4 
##                                      Belarus 
##                                           26 
##                                      Belgium 
##                                          201 
##                                       Belize 
##                                            4 
##                                        Benin 
##                                            1 
##                                      Bermuda 
##                                            2 
##                                       Bhutan 
##                                            1 
##              Bolivia, Plurinational State of 
##                                           69 
##                       Bosnia and Herzegovina 
##                                            4 
##                                     Botswana 
##                                            1 
##                                Bouvet Island 
##                                            2 
##                                       Brazil 
##                                         1160 
##               British Indian Ocean Territory 
##                                            1 
##                            Brunei Darussalam 
##                                           44 
##                                     Bulgaria 
##                                           37 
##                                 Burkina Faso 
##                                            1 
##                                      Burundi 
##                                            1 
##                                     Cambodia 
##                                           38 
##                                       Canada 
##                                         1743 
##                               Cayman Islands 
##                                            1 
##                     Central African Republic 
##                                            1 
##                                         Chad 
##                                            2 
##                                        Chile 
##                                          399 
##                                        China 
##                                          539 
##                             Christmas Island 
##                                            5 
##                      Cocos (Keeling) Islands 
##                                            4 
##                                     Colombia 
##                                          240 
##                                        Congo 
##                                            1 
##                                 Cook Islands 
##                                            6 
##                                   Costa Rica 
##                                           37 
##                                Côte d'Ivoire 
##                                            3 
##                                      Croatia 
##                                           26 
##                                         Cuba 
##                                            3 
##                                      Curaçao 
##                                            1 
##                                       Cyprus 
##                                            4 
##                               Czech Republic 
##                                           74 
##                                      Denmark 
##                                          113 
##                                     Djibouti 
##                                            1 
##                           Dominican Republic 
##                                           25 
##                                      Ecuador 
##                                           59 
##                                        Egypt 
##                                           22 
##                                  El Salvador 
##                                           13 
##                                      England 
##                                            6 
##                            Equatorial Guinea 
##                                            1 
##                                      Eritrea 
##                                            1 
##                                      Estonia 
##                                           27 
##                                       Europe 
##                                            8 
##                                Faroe Islands 
##                                            2 
##                                         Fiji 
##                                            2 
##                                      Finland 
##                                           85 
##                                       France 
##                                          414 
##                                French Guiana 
##                                            3 
##                  French Southern Territories 
##                                            2 
##                                        Gabon 
##                                            1 
##                                       Gambia 
##                                            1 
##                                      Georgia 
##                                           17 
##                                      Germany 
##                                          413 
##                                        Ghana 
##                                            1 
##                                    Gibraltar 
##                                            2 
##                                       Greece 
##                                           47 
##                                    Greenland 
##                                            2 
##                                      Grenada 
##                                            1 
##                                         Guam 
##                                            4 
##                                    Guatemala 
##                                           28 
##                                     Guernsey 
##                                            1 
##                                Guinea-Bissau 
##                                            1 
##                                       Guyana 
##                                            1 
##                                        Haiti 
##                                            1 
##            Heard Island and McDonald Islands 
##                                            2 
##                Holy See (Vatican City State) 
##                                           11 
##                                     Honduras 
##                                            8 
##                                    Hong Kong 
##                                          628 
##                                      Hungary 
##                                           41 
##                                      Iceland 
##                                            8 
##                                        India 
##                                          106 
##                                    Indonesia 
##                                          881 
##                    Iran, Islamic Republic of 
##                                            9 
##                                         Iraq 
##                                            5 
##                                      Ireland 
##                                           38 
##                                  Isle of Man 
##                                            3 
##                                       Israel 
##                                           74 
##                                        Italy 
##                                          212 
##                                      Jamaica 
##                                            4 
##                                        Japan 
##                                         1689 
##                                       Jordan 
##                                           13 
##                                   Kazakhstan 
##                                           79 
##                                        Kenya 
##                                            8 
##       Korea, Democratic People's Republic of 
##                                           18 
##                                       Kosovo 
##                                            1 
##                                       Kuwait 
##                                            1 
##                                   Kyrgyzstan 
##                                            6 
##             Lao People's Democratic Republic 
##                                            9 
##                                       Latvia 
##                                           43 
##                                      Lebanon 
##                                            8 
##                                      Lesotho 
##                                            2 
##                                        Libya 
##                                            2 
##                                    Lithuania 
##                                           37 
##                                   Luxembourg 
##                                            2 
##                                        Macao 
##                                           17 
##                                   Madagascar 
##                                            1 
##                                       Malawi 
##                                            1 
##                                     Malaysia 
##                                         1565 
##                                     Maldives 
##                                            3 
##                                        Malta 
##                                            6 
##                             Marshall Islands 
##                                            2 
##                                   Martinique 
##                                            3 
##                                    Mauritius 
##                                            3 
##                                      Mayotte 
##                                            3 
##                                       Mexico 
##                                          481 
##              Micronesia, Federated States of 
##                                            1 
##                         Moldova, Republic of 
##                                            6 
##                                       Monaco 
##                                            4 
##                                     Mongolia 
##                                         1154 
##                                   Montenegro 
##                                            1 
##                                      Morocco 
##                                           20 
##                                      Myanmar 
##                                            4 
##                                      Namibia 
##                                            1 
##                                        Nauru 
##                                            2 
##                                        Nepal 
##                                            8 
##                                  Netherlands 
##                                          190 
##                                New Caledonia 
##                                            2 
##                                  New Zealand 
##                                          244 
##                                    Nicaragua 
##                                            2 
##                                        Niger 
##                                            1 
##                                      Nigeria 
##                                            3 
##                                         Niue 
##                                            3 
##                               Norfolk Island 
##                                            1 
##                             Northern Ireland 
##                                            2 
##                     Northern Mariana Islands 
##                                            1 
##                                       Norway 
##                                           86 
##                                         Oman 
##                                            3 
##                                     Pakistan 
##                                           18 
##                                        Palau 
##                                            1 
##                                    Palestine 
##                                            3 
##                                       Panama 
##                                           26 
##                                     Paraguay 
##                                            9 
##                                         Peru 
##                                          381 
##                                  Philippines 
##                                         3898 
##                                       Poland 
##                                          227 
##                                     Portugal 
##                                          141 
##                                  Puerto Rico 
##                                            8 
##                                        Qatar 
##                                           13 
##                            Republic of Korea 
##                                         5287 
##                        Republic of Macedonia 
##                                            9 
##                        Republic of Venezuela 
##                                           48 
##                                      Réunion 
##                                            5 
##                                      Romania 
##                                           59 
##                           Russian Federation 
##                                          534 
##                             Saint Barthélemy 
##                                            1 
## Saint Helena, Ascension and Tristan da Cunha 
##                                            2 
##                                  Saint Lucia 
##                                            2 
##                                 Saint Martin 
##                                            1 
##                    Saint Pierre and Miquelon 
##                                            2 
##                                        Samoa 
##                                            1 
##                                   San Marino 
##                                            2 
##                                 Saudi Arabia 
##                                           29 
##                                     Scotland 
##                                            8 
##                                      Senegal 
##                                            4 
##                                       Serbia 
##                                           32 
##                                   Seychelles 
##                                            1 
##                                 Sierra Leone 
##                                            1 
##                                    Singapore 
##                                          855 
##                                     Slovakia 
##                                           28 
##                                     Slovenia 
##                                           18 
##                                      Somalia 
##                                            3 
##                                 South Africa 
##                                           34 
## South Georgia and the South Sandwich Islands 
##                                            1 
##                                        Spain 
##                                          398 
##                                    Sri Lanka 
##                                            2 
##                                        Sudan 
##                                            1 
##                                     Suriname 
##                                            1 
##               Svalbard and Jan Mayen Islands 
##                                            2 
##                                    Swaziland 
##                                            1 
##                                       Sweden 
##                                          126 
##                                  Switzerland 
##                                           53 
##                         Syrian Arab Republic 
##                                            4 
##                                       Taiwan 
##                                         2877 
##                                   Tajikistan 
##                                            1 
##                 Tanzania, United Republic of 
##                                            2 
##                                     Thailand 
##                                          184 
##                                         Togo 
##                                            2 
##                                      Tokelau 
##                                            1 
##                                        Tonga 
##                                            1 
##                          Trinidad and Tobago 
##                                            5 
##                                      Tunisia 
##                                           11 
##                                       Turkey 
##                                          244 
##                                 Turkmenistan 
##                                            1 
##                     Turks and Caicos Islands 
##                                            2 
##                                       Tuvalu 
##                                            3 
##                                      Ukraine 
##                                           79 
##                         United Arab Emirates 
##                                           56 
##                               United Kingdom 
##                                          598 
##                                United States 
##                                         6281 
##                                      Uruguay 
##                                           34 
##                    US Minor Outlying Islands 
##                                            8 
##                                   Uzbekistan 
##                                            2 
##                                      Vanuatu 
##                                            1 
##                                      Vietnam 
##                                          833 
##                      Virgin Islands, British 
##                                            2 
##                         Virgin Islands, U.S. 
##                                            7 
##                                        Wales 
##                                            2 
##                    Wallis and Futuna Islands 
##                                            2 
##                               Western Sahara 
##                                            2 
##                                        Yemen 
##                                            3 
##                                       Zambia 
##                                            1 
##                                     Zimbabwe 
##                                            3 
## 
## $Rank
## 
##    D   D+   C-    C   C+   B-    B   B+   A-    A   A+   S-    S   S+   SS    U 
##  995  994 1988 2387 2386 3181 3182 3181 3182 3181 3182 2784 2386 2386 2386 1591 
##    X   X+ 
##  318   79 
## 
## $RankColour
## 
## #1FA834 #3BB687 #46AD51 #4F64C9 #4F99C0 #552883 #5650C7 #6C496E #733E8F #79558C 
##    3182    3182    3181    3182    3181    2386    3181     994    2387    1988 
## #907591 #A763EA #B2972B #D8AF0E #DB8B1F #E0A71B #FF3813 #FF45FF 
##     995      79    2784    2386    2386    2386    1591     318
# Transposed overview: one line per column with its type and leading values
glimpse(data) 
## Rows: 39,769
## Columns: 16
## $ Standing          <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ Username          <chr> "5HAN", "CABOOZLED_PIE", "TURTLE", "SYAKEGOHAN", "VI…
## $ Country           <chr> "Japan", "United States", "Republic of Korea", "Japa…
## $ Wins              <dbl> 1026, 347, 511, 358, 320, 270, 179, 323, 164, 356, 9…
## $ Games.Played      <dbl> 1233, 394, 670, 437, 454, 358, 229, 463, 206, 533, 1…
## $ Winrate           <dbl> 0.8321, 0.8807, 0.7627, 0.8192, 0.7048, 0.7542, 0.78…
## $ APM               <dbl> 227.68, 213.34, 194.60, 202.60, 190.93, 203.54, 194.…
## $ PPS               <dbl> 4.27, 3.92, 3.39, 3.83, 3.42, 3.84, 3.44, 3.56, 3.04…
## $ VS                <dbl> 438.21, 420.66, 388.93, 393.83, 391.01, 400.73, 388.…
## $ Glicko.Rating     <int> 4276, 4026, 3963, 3944, 3931, 3899, 3862, 3854, 3853…
## $ Rating.Deviation  <int> 85, 71, 72, 76, 72, 81, 67, 65, 86, 68, 65, 64, 75, …
## $ Tetra.Rating      <dbl> 24752.28, 24640.67, 24601.05, 24591.11, 24579.48, 24…
## $ Rank              <fct> X+, X+, X+, X+, X+, X+, X+, X+, X+, X+, X+, X+, X+, …
## $ Active.This.Week  <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1…
## $ Supporter.Status. <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1…
## $ RankColour        <chr> "#A763EA", "#A763EA", "#A763EA", "#A763EA", "#A763EA…
# skimr overview: missing counts, completeness and summary statistics,
# grouped by column type
skim(data)
Data summary
Name data
Number of rows 39769
Number of columns 16
_______________________
Column type frequency:
character 3
factor 1
numeric 12
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Username 0 1 1 16 0 39764 0
Country 0 1 0 44 447 225 0
RankColour 0 1 7 7 0 18 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Rank 0 1 FALSE 18 B: 3182, A-: 3182, A+: 3182, B-: 3181

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Standing 0 1 19885.00 11480.47 1.00 9943.00 19885.00 29827.00 39769.00 ▇▇▇▇▇
Wins 0 1 158.71 212.86 0.00 31.00 83.00 202.00 4001.00 ▇▁▁▁▁
Games.Played 0 1 311.19 422.82 10.00 63.00 159.00 389.00 8142.00 ▇▁▁▁▁
Winrate 0 1 0.50 0.09 0.00 0.48 0.51 0.53 1.00 ▁▁▇▁▁
APM 0 1 30.56 23.13 1.05 15.11 23.10 38.13 227.68 ▇▂▁▁▁
PPS 0 1 1.26 0.45 0.30 0.94 1.17 1.48 4.27 ▇▇▂▁▁
VS 0 1 64.76 47.78 1.75 32.44 49.86 81.44 438.21 ▇▂▁▁▁
Glicko.Rating 0 1 1496.23 448.56 265.00 1168.00 1479.00 1774.00 4276.00 ▂▇▂▁▁
Rating.Deviation 0 1 74.94 12.80 60.00 62.00 72.00 86.00 100.00 ▇▃▂▂▂
Tetra.Rating 0 1 9862.82 6100.94 11.47 4531.12 9509.57 14693.09 24752.28 ▇▇▇▅▂
Active.This.Week 0 1 0.57 0.49 0.00 0.00 1.00 1.00 1.00 ▆▁▁▁▇
Supporter.Status. 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁

EDA 1.1 Missing Value Scan

Missing-value scan

# Number of missing values in each column
data %>%
  is.na() %>%
  colSums()
##          Standing          Username           Country              Wins 
##                 0                 0                 0                 0 
##      Games.Played           Winrate               APM               PPS 
##                 0                 0                 0                 0 
##                VS     Glicko.Rating  Rating.Deviation      Tetra.Rating 
##                 0                 0                 0                 0 
##              Rank  Active.This.Week Supporter.Status.        RankColour 
##                 0                 0                 0                 0
# Optional: share of missing values per column, in percent (two decimals)
round(100 * colMeans(is.na(data)), digits = 2)
##          Standing          Username           Country              Wins 
##                 0                 0                 0                 0 
##      Games.Played           Winrate               APM               PPS 
##                 0                 0                 0                 0 
##                VS     Glicko.Rating  Rating.Deviation      Tetra.Rating 
##                 0                 0                 0                 0 
##              Rank  Active.This.Week Supporter.Status.        RankColour 
##                 0                 0                 0                 0

EDA 1.2 Duplicate rows / keys

# Earlier brute-force attempt (disabled): compares every pair of usernames and
# records the matching index pairs. Superseded by the duplicated() calls below.
#duplicates = list()
#k = 1
#
#for (i in 1:(nrow(data) - 1)) {
#  for (j in (i + 1):nrow(data)) {
#    if (data$Username[i] == data$Username[j]) {
#      duplicates[[k]] = c(i, j)
#      k = k + 1
#    }
#  }
#}
# Rows that are an exact copy (all columns) of an earlier row
sum(duplicated(data))
## [1] 0
# If Username should be unique:
# counts rows whose Username already appeared in an earlier row
sum(duplicated(data$Username))
## [1] 5
#which(duplicated(data$Username))

#data$Username[c(11030, 11178, 11429, 12178, 25056)]
#data[c(11030, 11178, 11429, 12178, 25056), ]
# Show all rows that share their Username with at least one other row
data %>%
  group_by(Username) %>%
  filter(n() > 1)  
# Column summary before the duplicates are removed
summary(data)
##     Standing       Username           Country               Wins       
##  Min.   :    1   Length:39769       Length:39769       Min.   :   0.0  
##  1st Qu.: 9943   Class :character   Class :character   1st Qu.:  31.0  
##  Median :19885   Mode  :character   Mode  :character   Median :  83.0  
##  Mean   :19885                                         Mean   : 158.7  
##  3rd Qu.:29827                                         3rd Qu.: 202.0  
##  Max.   :39769                                         Max.   :4001.0  
##                                                                        
##   Games.Played       Winrate            APM              PPS       
##  Min.   :  10.0   Min.   :0.0000   Min.   :  1.05   Min.   :0.300  
##  1st Qu.:  63.0   1st Qu.:0.4844   1st Qu.: 15.11   1st Qu.:0.940  
##  Median : 159.0   Median :0.5087   Median : 23.10   Median :1.170  
##  Mean   : 311.2   Mean   :0.4951   Mean   : 30.56   Mean   :1.259  
##  3rd Qu.: 389.0   3rd Qu.:0.5327   3rd Qu.: 38.13   3rd Qu.:1.480  
##  Max.   :8142.0   Max.   :1.0000   Max.   :227.68   Max.   :4.270  
##                                                                    
##        VS         Glicko.Rating  Rating.Deviation  Tetra.Rating     
##  Min.   :  1.75   Min.   : 265   Min.   : 60.00   Min.   :   11.47  
##  1st Qu.: 32.44   1st Qu.:1168   1st Qu.: 62.00   1st Qu.: 4531.12  
##  Median : 49.86   Median :1479   Median : 72.00   Median : 9509.57  
##  Mean   : 64.76   Mean   :1496   Mean   : 74.94   Mean   : 9862.82  
##  3rd Qu.: 81.44   3rd Qu.:1774   3rd Qu.: 86.00   3rd Qu.:14693.09  
##  Max.   :438.21   Max.   :4276   Max.   :100.00   Max.   :24752.28  
##                                                                     
##       Rank       Active.This.Week Supporter.Status.  RankColour       
##  B      : 3182   Min.   :0.0000   Min.   :0.00000   Length:39769      
##  A-     : 3182   1st Qu.:0.0000   1st Qu.:0.00000   Class :character  
##  A+     : 3182   Median :1.0000   Median :0.00000   Mode  :character  
##  B-     : 3181   Mean   :0.5722   Mean   :0.02721                     
##  B+     : 3181   3rd Qu.:1.0000   3rd Qu.:0.00000                     
##  A      : 3181   Max.   :1.0000   Max.   :1.00000                     
##  (Other):20680
#d1 = data[c(which(duplicated(data$Username))), 1]
#d2 = data[c(which(duplicated(data$Username, fromLast=TRUE))), 1]
#
#for (i in d1) for (j in d2) {
#  if (data$Games.Played[i] > data$Games.Played[j])
#    data = data[-j,]
#  else
#    data = data[-i,]
#}

# Resolve duplicated usernames: for every Username keep only the row with the
# most Games.Played and drop the others.
#
# Rows are addressed by position, not through the Standing column, so this
# also works when Standing is not a gap-free 1..nrow sequence.
# If several rows of one user have the same Games.Played, the earliest row
# is kept (order() leaves ties in their original order).
by_games <- order(data$Games.Played, decreasing = TRUE)

# duplicated() flags every occurrence after the first; in descending
# Games.Played order the first occurrence is the row worth keeping.
to_remove <- by_games[duplicated(data$Username[by_games])]

# Guard: data[-integer(0), ] selects zero rows and would empty the data frame
if (length(to_remove) > 0) {
  data <- data[-to_remove, ]
}

# Print the data frame after the duplicate removal
data
# Sanity check: list any rows that still share a Username with another row
data %>%
  group_by(Username) %>%
  filter(n() > 1)  
# Exact duplicate rows remaining
sum(duplicated(data))
## [1] 0
# If Username should be unique:
sum(duplicated(data$Username))
## [1] 0
data

EDA 1.3 Quick numerical distributions

# Histograms of the first (up to) six numeric columns.
# NOTE(review): num_vars is defined earlier in the file; presumably it is a
# selector of the numeric columns of `data` - confirm.
numeric_cols <- names(data)[num_vars]

# 2x3 grid of histograms; par() returns the previous settings for the restore
old_par <- par(mfrow = c(2, 3))
# head() instead of [1:6]: with fewer than six columns, [1:6] pads the vector
# with NA and hist() then fails on data[[NA]]
for (v in head(numeric_cols, 6)) {
  hist(data[[v]], main = v, xlab = "")
}

# Put the layout back exactly as it was before this chunk
par(old_par)

EDA 1.4 Plotting

# Plot the whole data frame at once, points coloured via the RankColour column
plot(data, col=data$RankColour)

#plot(data$APM, data$Glicko.Rating, data$Winrate, data$PPS)
# Scatterplot matrix restricted to four performance columns
pairs(data[, c("APM", "Glicko.Rating", "Winrate", "PPS")], col=data$RankColour)

# Same idea with Games.Played added, drawn with small solid points
plot_data = data.frame(data$APM, data$Glicko.Rating, data$Games.Played, data$Winrate, data$PPS)
plot(plot_data, cex=0.3, pch=19, col=data$RankColour)

# Attacks per minute vs Glicko Rating
plot(data$APM, data$Glicko.Rating, cex=0.3, pch=19, col=data$RankColour)

# Attacks per minute vs Winrate
plot(data$APM, data$Winrate, cex=0.3, pch=19, col=data$RankColour)

# Games played vs Winrate
plot(data$Games.Played, data$Winrate, cex=0.3, pch=19, col=data$RankColour)

EDA 1.5 Outlier eyeball (boxplots)

# Boxplots of all columns named in numeric_cols, drawn in one plot on a
# shared axis (columns with large values dominate the scale)
boxplot(data[ , numeric_cols],
        las = 2,                    # vertical axis labels
        cex.axis = 0.7)             # shrink labels if many vars

library(tidyr)
library(ggplot2)

# Reshape to long format: the columns listed in numeric_cols are stacked into
# a Variable (column name) / Value (cell content) pair
data_long <- pivot_longer(
  data,
  cols = all_of(numeric_cols),
  names_to = "Variable",
  values_to = "Value"
)

# One panel per variable, each with its own y scale; the x axis text is
# hidden because the panel strip already names the variable
ggplot(data = data_long, mapping = aes(x = Variable, y = Value)) +
  geom_boxplot(outlier.colour = "firebrick") +
  facet_wrap(vars(Variable), ncol = 4, scales = "free_y") +
  theme_bw(base_size = 10) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )

# Who has ~4,000 wins?
# The rows are looked up from the which() result instead of a row number
# pasted from the console, which goes stale when the data changes.
most_wins <- which(data$Wins > 3000)
most_wins
## [1] 21656
data[most_wins, ]
# Who has ~8,000 games played? (same row as the wins outlier)
which(data$Games.Played > 8000) #.... BINGO
## [1] 21656
# Who has the APM above 200?
top_apm <- which(data$APM > 200)
top_apm
## [1] 1 2 4 6
data[top_apm, ]
data
# Who has a Glicko rating above 4000?
top_glicko <- which(data$Glicko.Rating > 4000)
top_glicko
## [1] 1 2
data[top_glicko, ]
# Names of every numeric column in the data frame
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]

# Hand-picked groups of columns with a comparable magnitude, so that the
# boxes inside one panel share a readable axis; everything not listed
# falls into the "small" group
big_vars <- c("Standing", "Tetra.Rating")
mid_vars <- c("Games.Played", "Wins", "Glicko.Rating")
small_vars <- numeric_cols[!(numeric_cols %in% c(big_vars, mid_vars))]

# Stop early if a hand-listed column is missing or not numeric
stopifnot(
  all(big_vars %in% numeric_cols),
  all(mid_vars %in% numeric_cols),
  all(small_vars %in% numeric_cols)
)

# Three panels next to each other, one per magnitude group
par(mfrow = c(1, 3))

panels <- list(
  "Big scale" = big_vars,
  "Mid scale" = mid_vars,
  "Small scale" = small_vars
)
for (panel_title in names(panels)) {
  boxplot(data[, panels[[panel_title]]], main = panel_title, las = 2)
}

# back to a single plot per device
par(mfrow = c(1, 1))
## 1. Pick out numeric columns --------------------------------------------
numeric_cols <- names(Filter(is.numeric, data))

## 2. Choose a grid that fits them all (3 columns is usually nice) ---------
nplots <- length(numeric_cols)
ncols  <- 3
# at least one row, so par() does not fail when there is nothing to plot
nrows  <- max(1, ceiling(nplots / ncols))

# par() returns the previous values of the settings it changes; keeping them
# lets us restore the margins too, not just the layout
old_par <- par(mfrow = c(nrows, ncols),    # grid of plots
               mar = c(4, 4, 2, 0.5))      # tighten margins a bit

## 3. Loop over the variables ---------------------------------------------
for (v in numeric_cols) {
  boxplot(data[[v]],
          main = v,
          ylab = "",                   # leave y-axis label blank
          horizontal = TRUE)           # optional: horizontal boxes
}

par(old_par)                           # restore layout AND margins

EDA 1.6 Univariate

# Skewness measured as 3 * (mean - median) / sd.
#
# variable: numeric vector.
# na.rm:    if TRUE, missing values are dropped before computing the mean,
#           median and standard deviation. The default FALSE keeps the
#           previous behaviour (any NA in the input yields NA).
# Returns a single number: 0 when mean and median coincide, positive when
# the mean lies above the median, negative when it lies below.
skewness = function(variable, na.rm = FALSE) {
  centre_gap = mean(variable, na.rm = na.rm) - median(variable, na.rm = na.rm)
  (3 * centre_gap) / sd(variable, na.rm = na.rm)
}

# Distribution of the Glicko ratings
hist(data$Glicko.Rating)

# Positive result: the mean lies slightly above the median
skewness(data$Glicko.Rating) # 0.1151184
## [1] 0.1151184

Univariate normality test

#data_numerical = data[,c(1,4,5,6,7,8,9,10,11,12,15,17)]

#str(data)

# Columns used for the normality checks, selected by name so the selection
# cannot silently shift when columns are added or reordered.
# These are the columns previously addressed as positions 1, 4:12 and 15.
normality_vars <- c("Standing", "Wins", "Games.Played", "Winrate", "APM",
                    "PPS", "VS", "Glicko.Rating", "Rating.Deviation",
                    "Tetra.Rating", "Supporter.Status.")
data_numerical = data[, normality_vars]

# One normal Q-Q plot per variable; the red line is the reference drawn by
# qqline(), so points bending away from it indicate non-normality
for (v in names(data_numerical)) {
  qqnorm(data_numerical[[v]],
         main = v,
         ylab = "Observed Quantiles",
         xlab = "Theoretical Quantiles")
  qqline(data_numerical[[v]], col = "red", lwd = 2)
}


EDA 1.7 Multivariate

Multivariate normality test

# Chi-square Q-Q plot of the squared Mahalanobis distances.
# Every statistic is computed from the same NA-free data frame so that
# n, p, the covariance matrix and the distances always agree in size.
data_num_complete <- na.omit(data_numerical)   # avoids NA issues
n <- nrow(data_num_complete)
p <- ncol(data_num_complete)
Sx <- cov(data_num_complete)
D2 <- mahalanobis(data_num_complete, colMeans(data_num_complete), Sx)

#Theoretical χ² quantiles
chi_q <- qchisq(ppoints(n, a = 0.5), df = p)

# Chi-square Q-Q plot (multivariate normality): distances are plotted
# against the chi-square quantiles with p degrees of freedom
qqplot(chi_q, D2,
       ylab = "Mahalanobis Distance",
       xlab = bquote("quantiles of " ~ chi[.(p)]^2))

# Reference line y = x
abline(0, 1, col = "red", lwd = 2)

title(main = "Mahalanobis D²", font.main = 2)

A comparison of the Mahalanobis D^2 distances with the χ_11^2 reference line shows a pronounced upward curvature and many large outliers, indicating that the joint distribution of the 11 numeric variables deviates substantially from multivariate normality.

#APM vs PPS

#APM vs PPS
# Scatter of APM against PPS, points coloured via the RankColour column
plot(data$PPS, data$APM, cex=0.5, pch=19, col=data$RankColour,
     xlab="Pieces Per Second", ylab="Attack Per Minute",
     main="APM vs PPS, Coloured by Rank")
# Simple linear regression of APM on PPS; the fitted line is drawn on top
# NOTE(review): lm(APM ~ PPS, data = data) would allow predict() on new data,
# but it changes the coefficient labels printed by summary()
linearFit = lm(data$APM ~ data$PPS)
abline(linearFit, lwd=2)

summary(linearFit)
## 
## Call:
## lm(formula = data$APM ~ data$PPS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -90.474  -5.498   0.131   5.479  75.780 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) -28.8916     0.1417  -203.9 <0.0000000000000002 ***
## data$PPS     47.2307     0.1061   445.2 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.454 on 39762 degrees of freedom
## Multiple R-squared:  0.8329, Adjusted R-squared:  0.8329 
## F-statistic: 1.982e+05 on 1 and 39762 DF,  p-value: < 0.00000000000000022
# Diagnostic plots of linearFit, arranged in a 2x2 grid.
# par() returns the previous layout, which is restored afterwards so that
# later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(linearFit, cex = 0.5, pch = 19, col = data$RankColour)
par(old_par)

Supporter by Rank

# Supporter status cross-tabulated against rank
# (rows: Supporter.Status. 0/1, columns: Rank)
# NOTE(review): the variables `table` and `proportions` shadow the base
# functions of the same name; the names are kept because later code may
# rely on them.
table <- table(data$Supporter.Status., data$Rank)
# Share of each status within a rank (every column sums to 1)
proportions <- prop.table(table, margin = 2)

# Bar colours in row order of the table: status 0 first, status 1 second
status_colours <- c("#4d4d4d", "orange")

# Absolute counts per rank
barplot(
  table,
  main = "Frequency of Supporters & Non-Supporters by Rank",
  xlab = "Rank",
  ylab = "Frequency",
  col = status_colours
)
legend(
  "topright",
  legend = c("Supporter", "Not Supporter"),
  col = rev(status_colours),
  pch = 15
)

# Relative shares per rank
barplot(
  proportions,
  main = "Proportion of Supporters & Non-Supporters by Rank",
  xlab = "Rank",
  ylab = "Proportion",
  col = status_colours
)

#box()

### Correlation

#help(cor)

#data$Country = as.factor(data$Country)
# Numeric encoding of the ordered Rank factor (1 = first level)
data$Rank_number = as.numeric(data$Rank)
#data$Rank_number

#str(data)

# Columns are selected by name: a position such as 17 only points at
# Rank_number while it happens to be the last column that was appended.
# Order matches the previous positional selection (1, 4:12, 14, 15, 17).
cor_vars <- c("Standing", "Wins", "Games.Played", "Winrate", "APM", "PPS",
              "VS", "Glicko.Rating", "Rating.Deviation", "Tetra.Rating",
              "Active.This.Week", "Supporter.Status.", "Rank_number")

# Pairwise correlation matrix (cor() default method)
data_correlation = cor(data[, cor_vars])
data_correlation
##                     Standing       Wins Games.Played     Winrate        APM
## Standing           1.0000000 -0.4408111  -0.41821733 -0.60516544 -0.8390488
## Wins              -0.4408111  1.0000000   0.99892120  0.15088807  0.3844931
## Games.Played      -0.4182173  0.9989212   1.00000000  0.12782667  0.3583682
## Winrate           -0.6051654  0.1508881   0.12782667  1.00000000  0.4477224
## APM               -0.8390488  0.3844931   0.35836818  0.44772242  1.0000000
## PPS               -0.8448133  0.4024040   0.38002884  0.46739914  0.9126476
## VS                -0.8582807  0.3914379   0.36553775  0.45494757  0.9949365
## Glicko.Rating     -0.9602501  0.4219243   0.39557370  0.58659481  0.9241072
## Rating.Deviation   0.2637014 -0.3751242  -0.37069935 -0.19753213 -0.1982860
## Tetra.Rating      -0.9955876  0.4431432   0.41968178  0.58239296  0.8820971
## Active.This.Week  -0.1439440  0.2090427   0.20694240  0.03169982  0.1364815
## Supporter.Status. -0.1665577  0.1059904   0.09762496  0.09728786  0.2360381
## Rank_number       -0.9944410  0.4403577   0.41705449  0.63463993  0.8527989
##                          PPS         VS Glicko.Rating Rating.Deviation
## Standing          -0.8448133 -0.8582807    -0.9602501       0.26370138
## Wins               0.4024040  0.3914379     0.4219243      -0.37512420
## Games.Played       0.3800288  0.3655377     0.3955737      -0.37069935
## Winrate            0.4673991  0.4549476     0.5865948      -0.19753213
## APM                0.9126476  0.9949365     0.9241072      -0.19828605
## PPS                1.0000000  0.9147989     0.8973381      -0.21256242
## VS                 0.9147989  1.0000000     0.9390499      -0.20462546
## Glicko.Rating      0.8973381  0.9390499     1.0000000      -0.22153297
## Rating.Deviation  -0.2125624 -0.2046255    -0.2215330       1.00000000
## Tetra.Rating       0.8731348  0.8999056     0.9749465      -0.25957441
## Active.This.Week   0.1421478  0.1409972     0.1455130      -0.50978822
## Supporter.Status.  0.2103876  0.2324646     0.2037424      -0.05309128
## Rank_number        0.8541937  0.8712542     0.9680742      -0.26893159
##                   Tetra.Rating Active.This.Week Supporter.Status. Rank_number
## Standing            -0.9955876      -0.14394401       -0.16655774  -0.9944410
## Wins                 0.4431432       0.20904274        0.10599042   0.4403577
## Games.Played         0.4196818       0.20694240        0.09762496   0.4170545
## Winrate              0.5823930       0.03169982        0.09728786   0.6346399
## APM                  0.8820971       0.13648146        0.23603810   0.8527989
## PPS                  0.8731348       0.14214783        0.21038758   0.8541937
## VS                   0.8999056       0.14099717        0.23246457   0.8712542
## Glicko.Rating        0.9749465       0.14551299        0.20374241   0.9680742
## Rating.Deviation    -0.2595744      -0.50978822       -0.05309128  -0.2689316
## Tetra.Rating         1.0000000       0.14632194        0.18145251   0.9925282
## Active.This.Week     0.1463219       1.00000000        0.04277167   0.1428549
## Supporter.Status.    0.1814525       0.04277167        1.00000000   0.1727455
## Rank_number          0.9925282       0.14285490        0.17274551   1.0000000
#which(abs(data_correlation) > 0.90)


#data[,-c(1,15,16,28,29,43,57.58)]

Highly Correlated variables: * Standing or Tetra.Rating * Games played or Wins * APM, PPS, VS, Glicko Rating * Rank_number

# Predictors ADAM thinks should be used
# Linear model of Glicko.Rating on activity, speed/attack metrics and status flags
linearFit2 = lm(data$Glicko.Rating ~ data$Games.Played + data$APM + data$PPS + data$VS + data$Rating.Deviation + data$Active.This.Week + data$Supporter.Status.)

summary(linearFit2)
## 
## Call:
## lm(formula = data$Glicko.Rating ~ data$Games.Played + data$APM + 
##     data$PPS + data$VS + data$Rating.Deviation + data$Active.This.Week + 
##     data$Supporter.Status.)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -805.82  -90.05   23.08  103.27  403.73 
## 
## Coefficients:
##                          Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)            764.606064   6.505819 117.526 < 0.0000000000000002 ***
## data$Games.Played        0.041823   0.001887  22.166 < 0.0000000000000002 ***
## data$APM               -20.144351   0.301810 -66.745 < 0.0000000000000002 ***
## data$PPS               239.251542   3.902719  61.304 < 0.0000000000000002 ***
## data$VS                 16.347619   0.147828 110.585 < 0.0000000000000002 ***
## data$Rating.Deviation   -0.301614   0.066845  -4.512        0.00000643577 ***
## data$Active.This.Week   -3.830898   1.640073  -2.336               0.0195 *  
## data$Supporter.Status. -27.927844   4.412990  -6.329        0.00000000025 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 139 on 39756 degrees of freedom
## Multiple R-squared:  0.9039, Adjusted R-squared:  0.9039 
## F-statistic: 5.344e+04 on 7 and 39756 DF,  p-value: < 0.00000000000000022
# The ones we keep based on correlation
# (same response as linearFit2, but without the APM and PPS terms)
linearFit3 = lm(data$Glicko.Rating ~ data$Games.Played + data$VS + data$Rating.Deviation + data$Active.This.Week + data$Supporter.Status.)

summary(linearFit3)
## 
## Call:
## lm(formula = data$Glicko.Rating ~ data$Games.Played + data$VS + 
##     data$Rating.Deviation + data$Active.This.Week + data$Supporter.Status.)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -799.63  -97.96   31.41  115.37  363.26 
## 
## Coefficients:
##                          Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)            959.824488   6.367757 150.732 < 0.0000000000000002 ***
## data$Games.Played        0.059467   0.002047  29.057 < 0.0000000000000002 ***
## data$VS                  8.634738   0.017591 490.851 < 0.0000000000000002 ***
## data$Rating.Deviation   -0.516538   0.072963  -7.079     0.00000000000147 ***
## data$Active.This.Week   -2.339120   1.791228  -1.306                0.192    
## data$Supporter.Status. -44.646927   4.813891  -9.275 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 151.9 on 39758 degrees of freedom
## Multiple R-squared:  0.8854, Adjusted R-squared:  0.8854 
## F-statistic: 6.142e+04 on 5 and 39758 DF,  p-value: < 0.00000000000000022

Trying to estimate games played

# Here we try to predict (estimate) a player's number of games played
# Disabled variant below: the same response with a larger set of predictors
#linearFit4 = lm(data$Games.Played ~ data$Standing + data$Winrate + data$APM + data$PPS + data$VS  + data$Glicko.Rating + data$Rating.Deviation + data$Tetra.Rating + data$Active.This.Week + data$Supporter.Status. + data$Rank_number)

# Active variant: Games.Played modelled on seven predictors
linearFit4 = lm(data$Games.Played ~ data$APM + data$PPS + data$VS  + data$Glicko.Rating + data$Rating.Deviation + data$Active.This.Week + data$Supporter.Status.)

summary(linearFit4)
## 
## Call:
## lm(formula = data$Games.Played ~ data$APM + data$PPS + data$VS + 
##     data$Glicko.Rating + data$Rating.Deviation + data$Active.This.Week + 
##     data$Supporter.Status.)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -900.5 -199.3  -67.5   92.2 7627.3 
## 
## Coefficients:
##                         Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)            469.30997   19.81152  23.689 < 0.0000000000000002 ***
## data$APM                -1.95136    0.84076  -2.321             0.020295 *  
## data$PPS               154.28421   10.75888  14.340 < 0.0000000000000002 ***
## data$VS                 -0.29097    0.44658  -0.652             0.514697    
## data$Glicko.Rating       0.29189    0.01317  22.166 < 0.0000000000000002 ***
## data$Rating.Deviation   -9.56107    0.17000 -56.240 < 0.0000000000000002 ***
## data$Active.This.Week    8.17218    4.33291   1.886             0.059293 .  
## data$Supporter.Status.  44.88804   11.66210   3.849             0.000119 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 367.3 on 39756 degrees of freedom
## Multiple R-squared:  0.2452, Adjusted R-squared:  0.2451 
## F-statistic:  1845 on 7 and 39756 DF,  p-value: < 0.00000000000000022
#Supporter
# Linear model with the 0/1 Supporter.Status. column as the response
# NOTE(review): the response is binary, so fitted values are not confined to
# [0, 1]; glm(..., family = binomial) may be the better fit - confirm intent
linearFit5 = lm(data$Supporter.Status. ~ data$APM + data$PPS + data$VS  + data$Glicko.Rating + data$Rating.Deviation + data$Active.This.Week)

summary(linearFit5)
## 
## Call:
## lm(formula = data$Supporter.Status. ~ data$APM + data$PPS + data$VS + 
##     data$Glicko.Rating + data$Rating.Deviation + data$Active.This.Week)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32036 -0.03240 -0.01116 -0.00247  1.00981 
## 
## Coefficients:
##                           Estimate   Std. Error t value           Pr(>|t|)    
## (Intercept)            0.012688039  0.008519656   1.489             0.1364    
## data$APM               0.002697576  0.000361315   7.466 0.0000000000000844 ***
## data$PPS               0.000116067  0.004626828   0.025             0.9800    
## data$VS               -0.000220984  0.000192048  -1.151             0.2499    
## data$Glicko.Rating    -0.000033626  0.000005661  -5.940 0.0000000028680277 ***
## data$Rating.Deviation -0.000071420  0.000073109  -0.977             0.3286    
## data$Active.This.Week  0.003342003  0.001863281   1.794             0.0729 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.158 on 39757 degrees of freedom
## Multiple R-squared:  0.05734,    Adjusted R-squared:  0.0572 
## F-statistic: 403.1 on 6 and 39757 DF,  p-value: < 0.00000000000000022

Business Intelligence

BI 2.1 - Ranking UP

# APM distribution within every rank
boxplot(data$APM ~ data$Rank)

# Mean APM / PPS / VS per rank
avgAPMs <- tapply(X = data$APM, INDEX = data$Rank, FUN = mean)
avgPPSs <- tapply(X = data$PPS, INDEX = data$Rank, FUN = mean)
avgVSs  <- tapply(X = data$VS,  INDEX = data$Rank, FUN = mean)

# The three averages next to each other, one bar chart per metric
par(mfrow = c(1, 3))
rank_panels <- list(
  "avgAPMs by Rank" = avgAPMs,
  "avgPPSs by Rank" = avgPPSs,
  "avgVSs by Rank"  = avgVSs
)
for (panel_title in names(rank_panels)) {
  barplot(rank_panels[[panel_title]], main = panel_title)
}

# back to one plot per device
par(mfrow = c(1, 1))

BI 2.2 - Finding more supporters


BI 2.3 - Maintaining Competitive Integrity

# Look for outliers inside B rank: first isolate the B-rank rows ...
in_b_rank <- which(data$Rank == "B")
bRanks <- data[in_b_rank, ]

# ... then plot their APM against their standing
plot(
  x = bRanks$Standing,
  y = bRanks$APM,
  main = "APM of all Player in B Rank",
  xlab = "Standing",
  ylab = "Attack Per Minute",
  col = bRanks$RankColour,
  pch = 19,
  cex = 1
)

# B-rank rows with an APM above 40
bRanks[which(bRanks$APM > 40), ] #users: KAMYI & CRISSELLE